Goal: predict the likelihood of a customer defaulting on a loan, based on the customer's EFTOS money transactions.
#Basic libraries
import pandas as pd
import numpy as np
import datetime
#Graphic libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#Quick EDA
import pandas_profiling
import missingno as msno
import warnings
# Silence library warnings for cleaner notebook output.
warnings.filterwarnings('ignore')
# Show up to 500 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 500)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2
import plotly.graph_objects as go
# NOTE(review): seaborn is already imported above — duplicate import.
import seaborn as sns
import catboost
import category_encoders as ce
#### Classifiers class
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.metrics import f1_score, roc_curve, roc_auc_score, precision_recall_curve, auc
from sklearn.metrics import confusion_matrix, mean_absolute_error,accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
import plotly.express as px
# import sklearn
# sklearn.metrics.SCORERS.keys()
import shap
# Initialise SHAP's JS visualisation backend for notebook plots.
shap.initjs()
# Load customer lifecycle data and per-customer transactions, then join them.
customer_df = pd.read_csv('../data/customer_start_terminate.csv', parse_dates=['startDate', 'firstTransactionDate', 'terminatedDate'])
transaction_df = pd.read_csv('../data/transactions_customer.csv', parse_dates=['date'])
# Normalise terminatedDate to midnight (equivalent to the old
# pd.to_datetime(....dt.date) round-trip, without leaving datetime64).
customer_df['terminatedDate'] = customer_df['terminatedDate'].dt.normalize()
# Tenure in days (float). Dividing by a 1-day Timedelta replaces
# .astype('timedelta64[D]'), which pandas 2.x no longer supports.
customer_df['Tenure'] = (customer_df.terminatedDate - customer_df.startDate) / pd.Timedelta(days=1)
# transaction_df['year'] = transaction_df.date.dt.year
# transaction_df['month'] = transaction_df.date.dt.month
# transaction_df = transaction_df.groupby(['customer_id', 'industry', 'location', ''])
data_df = transaction_df.merge(customer_df, on='customer_id', how='left')
data_df['year'] = data_df.date.dt.year
data_df['month'] = data_df.date.dt.month
display(customer_df.head(), transaction_df.head())
display(data_df.head())
# Sanity check: the left merge should not drop or invent customers.
print('No of customers in customer_df: {}'.format(len(customer_df.customer_id.unique())))
print('No of customers in transaction_df: {}'.format(len(transaction_df.customer_id.unique())))
print('No of customers in data_df: {}'.format(len(data_df.customer_id.unique())))
transaction_df.columns
# Daily count of customer start dates.
start_counts = (
    customer_df.sort_values('startDate')
    .groupby(['startDate'])
    .size()
    .reset_index(name='Count')
)
px.line(
    start_counts, x="startDate", y="Count", template="plotly_white",
    labels=dict(Count="Counts"),
    width=800, height=400, title="startDate"
)
# Daily count of contract terminations.
termination_counts = (
    customer_df.sort_values('terminatedDate')
    .groupby(['terminatedDate'])
    .size()
    .reset_index(name='Count')
)
fig = px.line(
    termination_counts, x="terminatedDate", y="Count", template="plotly_white",
    labels=dict(Count="Counts"),
    width=800, height=400, title="endDate, 780 terminated, 521 in 13-02-07"
)
fig.show()
# Distribution of contract tenure (in days) across customers.
fig = px.histogram(customer_df, x="Tenure", marginal="rug", title='Tenure max=3990, min=1162 days')
fig.show()
# Daily average transaction amount per customer over time.
fig = px.line(transaction_df, x="date", y="daily_average_amount", color='customer_id', hover_data=['location', 'industry'])
fig.update_layout(showlegend=False)
fig.show()
# Outlier customer excluded from the monthly-amount plot below.
x = '23755432da68528f115c9633c0d7834f'
# Combine both filters with `&` in one mask. The original chained
# data_df[mask1][mask2] indexed an already-filtered frame with a mask built
# from the full frame, which relies on index alignment and triggers a
# UserWarning (and can silently mis-select).
_ = data_df[data_df.terminatedDate.isnull() & (data_df.customer_id != x)]
fig = px.line(_, x="date", y="monthly_amount", color='customer_id', title='monthly amount')
fig.update_layout(showlegend=False)
fig.show()
# Hypothesis: customers who terminated their contracts are defaulted
# customers (class 1); active customers are class 0.
data_df['class'] = data_df.terminatedDate.notnull().astype(int)
# Build per-customer lagged copies of the amount features so the final
# (most recent) row carries the recent history.
# NOTE(review): range(1, lags) with lags=5 produces lags 1..4, i.e. four
# lagged columns per feature — confirm whether five were intended.
lags = 5
final_df = data_df.copy().sort_values(by='date')
by_customer = final_df.groupby('customer_id')
for lag in range(1, lags):
    for col in ('monthly_amount', 'daily_average_amount', 'daily_sd_amount'):
        final_df['{}_{}'.format(col, lag)] = by_customer[col].shift(lag)
# Keep only each customer's most recent observation.
final_df = final_df.groupby('customer_id').tail(1)
print(f'No of customers in data_df: {len(final_df.customer_id.unique())}')
final_df.columns
# Per-customer means of the amount features, split by class.
# pandas 2.x requires a list (double brackets) to subset groupby columns;
# the old bare `['a','b',...]` tuple form was removed.
per_customer = data_df.groupby(['customer_id', 'class'])[['daily_sd_amount', 'monthly_amount', 'daily_average_amount', 'Tenure']].mean().reset_index()
fig = px.histogram(per_customer, x="monthly_amount", marginal="rug", hover_data=per_customer.columns, color="class")
fig.show()
# Drop identifier/date/leakage columns; keep amounts, lags, categoricals, class.
features = final_df.columns.difference(['date', 'customer_id', 'startDate',
                                        'firstTransactionDate', 'terminatedDate', 'Tenure', 'startOffset',
                                        'endOffset'])
cat_features = ['industry', 'location']
# Rows with missing lag values (customers with short histories) are dropped.
final_df = final_df[features].dropna()
hot_encoder = ce.OneHotEncoder(cols=cat_features)
final_df = hot_encoder.fit_transform(final_df)
# Class balance plot. Name the reset column explicitly: older pandas calls it
# 'index', newer pandas names it after the series, so relying on either
# breaks. `class` holds ints (0/1) after the astype(int) above, so map ints
# rather than replacing True/False.
_ = final_df['class'].value_counts().rename_axis('class').reset_index(name='Count')
_['class'] = _['class'].map({1: 'Defaulted', 0: 'Non Defaulted'})
px.bar(
    _, x="class", y="Count", template="plotly_white",
    labels=dict(Count="Counts"),
    width=800, height=400, title="Number of Defaulting Customers", color='class'
)
from sklearn.model_selection import train_test_split
def random_create_data(df, trgt='class', test_size=0.33, random_state=42, stratify=False):
    """Split `df` into train/test feature matrices and target vectors.

    Parameters
    ----------
    df : pandas.DataFrame containing the target column.
    trgt : str, name of the target column (default 'class').
    test_size : float, fraction of rows held out for testing (default 0.33).
    random_state : int, seed for a reproducible split (default 42).
    stratify : bool, if True preserve the class ratio in both splits
        (useful for imbalanced targets; off by default so the historical
        split is unchanged).

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    # Features are every column except the target.
    X = df[df.columns.difference([trgt])]
    y = df[trgt]
    return train_test_split(X, y, test_size=test_size, random_state=random_state,
                            stratify=y if stratify else None)
class Classifier(object):
    """Bundle a named estimator with its fixed and tunable hyper-parameters.

    `Classifier(name)` instantiates the estimator registered under `name`,
    applies its fixed initial parameters, and exposes the tuning grid with
    the `clsfr__` prefix matching the Pipeline step name used by HyperTuning.
    """

    # Candidate estimators, keyed by the name accepted by __init__.
    classifier = {'LogisticRegression': LogisticRegression(),
                  'XGBoost': xgb.XGBClassifier(),
                  'SVM': svm.SVC(),
                  'RandomForestClassifier': RandomForestClassifier()}

    # Classifier initial parameters (fixed; applied via set_params).
    clfr_init_param = {'LogisticRegression': {'random_state': 42,
                                              # liblinear supports both the 'l1'
                                              # and 'l2' penalties searched below;
                                              # the default lbfgs solver rejects
                                              # 'l1' and would abort the search.
                                              'solver': 'liblinear',
                                              # NOTE(review): max_iter=5 is very
                                              # low and may stop before converging
                                              # — confirm it is intentional.
                                              'max_iter': 5
                                              },
                       'XGBoost': {'objective': 'binary:logistic',
                                   'max_delta_step': 1,
                                   'random_state': 42,
                                   'min_child_weight': 1},
                       'SVM': {'kernel': 'rbf',
                               'class_weight': 'balanced',
                               'random_state': 42},
                       'RandomForestClassifier': {'random_state': 42}
                       }

    # Classifier tuning parameters (grids for GridSearchCV, skopt-style
    # (low, high, prior) spaces for BayesSearchCV in the XGBoost case).
    clfr_tune_param = {'LogisticRegression': {'C': [0.005, 0.01, 0.1, 1.],
                                              'penalty': ['l1', 'l2'],
                                              'class_weight': [{1: 10, 0: 1}, {1: 1, 0: 1}, 'balanced']
                                              },
                       'SVM': {'C': [0.005, 0.01, 0.1],
                               'gamma': [0.001, 0.01, 0.1, 1]},
                       'RandomForestClassifier': {'bootstrap': [True, False],
                                                  'max_depth': [10, 20, 30],
                                                  # 'auto' was removed in
                                                  # scikit-learn 1.3; for
                                                  # classifiers it was an alias
                                                  # of 'sqrt', so the effective
                                                  # search space is unchanged.
                                                  'max_features': ['sqrt'],
                                                  'min_samples_leaf': [1, 2, 4],
                                                  'min_samples_split': [2, 5, 10],
                                                  'n_estimators': [200, 600, 800, 1000, 1200]},
                       'XGBoost': {
                           'learning_rate': (0.001, 0.1, 'log-uniform'),
                           'max_depth': (2, 10),
                           'subsample': (0.1, 1.0, 'uniform'),
                           'colsample_bytree': (0.1, 1.0, 'uniform'),
                           'n_estimators': (100, 2000),
                           'gamma': (0, 10., 'uniform')
                       }
                       }

    def __init__(self, name):
        """Select estimator `name`, apply its init params, build prefixed grid."""
        self.name = name
        # Prefix keys with 'clsfr__' so they address the Pipeline step.
        self.tune_param = {'clsfr__' + k: v for k, v in self.clfr_tune_param[name].items()}
        self.model = self.classifier[name].set_params(**self.clfr_init_param[name])
# Plotting helpers for classifier evaluation.
class Plot:
    """Static helpers for visualising binary-classifier performance."""

    def __init__(self):
        pass

    @staticmethod
    def plot_auc(y_test, y_pred_prob):
        """Draw the ROC curve (with a chance diagonal) and print the AUC."""
        fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
        plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
        plt.plot(fpr, tpr, marker='.')
        plt.show()
        auc_score = roc_auc_score(y_test, y_pred_prob)
        print('AUC: %.3f' % auc_score)

    @staticmethod
    def plot_precision_recall(y_test, y_pred_prob):
        """Plot precision/recall against threshold, then the PR curve."""
        precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
        # Precision and recall have one more entry than thresholds,
        # so drop the last value when plotting against thresholds.
        plt.title("Precision-Recall vs Threshold Chart")
        plt.plot(thresholds, precision[:-1], "b--", label="Precision")
        plt.plot(thresholds, recall[:-1], "r--", label="Recall")
        plt.ylabel("Precision, Recall")
        plt.xlabel("Threshold")
        plt.legend(loc="lower left")
        plt.ylim([0, 1])
        plt.show()
        # Filled step plot of the PR curve itself.
        plt.step(recall, precision, color='b', alpha=0.2, where='post')
        plt.fill_between(recall, precision, alpha=0.2, color='b')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('2-class Precision-Recall curve')
        plt.show()

    @staticmethod
    def confusion_matrix(y_true, y_pred):
        """Print the classification report and draw the confusion matrix."""
        print("Classification Report: \n", classification_report(y_true=y_true, y_pred=y_pred))
        accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
        print("\nAccuracy", accuracy)
        # Inside this method the bare name resolves to sklearn's
        # confusion_matrix (module global), not this staticmethod.
        conf_mat = confusion_matrix(y_true=y_true, y_pred=y_pred)
        print('Confusion matrix:\n', conf_mat)
        labels = ['Class 0', 'Class 1']
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
        fig.colorbar(cax)
        # Leading '' aligns the labels with matshow's tick positions.
        ax.set_xticklabels([''] + labels)
        ax.set_yticklabels([''] + labels)
        plt.xlabel('Predicted')
        plt.ylabel('Expected')
        plt.show()
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
#Cross validation
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified CV shared by all hyper-parameter searches below;
# stratification keeps the class ratio consistent across folds.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)
class HyperTuning(object):
    """Hyper-parameter searches over a Classifier pipeline, plus reporting."""

    def __init__(self):
        pass

    @staticmethod
    def grid_search(name, X_train, y_train):
        """Exhaustive grid search (f1_weighted) over Classifier `name`.

        Returns the fitted GridSearchCV, refit on the best parameters.
        """
        clfr = Classifier(name)
        pipe = Pipeline([('clsfr', clfr.model)])
        # `iid=True` was removed from GridSearchCV in scikit-learn 0.24 and
        # raises TypeError on current versions, so it is no longer passed.
        grid = GridSearchCV(pipe, cv=cv,
                            param_grid=clfr.tune_param,
                            scoring='f1_weighted',
                            verbose=3,
                            n_jobs=-1,
                            refit=True
                            )
        grid.fit(X_train, y_train)
        print('Finished GridSearch')
        return grid

    @staticmethod
    def baysian_search(name, X_train, y_train):
        """Bayesian optimisation (f1) over Classifier `name` via skopt.

        Returns the fitted BayesSearchCV, refit on the best parameters.
        """
        clfr = Classifier(name)
        pipe = Pipeline([('clsfr', clfr.model)])
        opt = BayesSearchCV(
            estimator=pipe,
            scoring='f1',
            search_spaces=[(clfr.tune_param, 5)],  # (parameter space, # of evaluations)
            cv=cv,
            verbose=True,
            n_jobs=-1,
            refit=True,
            random_state=42
        )
        opt.fit(X_train, y_train)
        print('Finished Training')
        return opt

    @staticmethod
    def evaluate_opt(opt, X, y):
        """Summarise a fitted search and report train/test performance.

        X and y are dicts holding the 'train' and 'test' splits.
        """
        print('Best model\n')
        print(opt.best_estimator_)
        print('\n')
        print('Best score on the validation: {}'.format(opt.best_score_))
        result_df = pd.DataFrame(opt.cv_results_)\
            .loc[:, ['mean_test_score', 'rank_test_score', 'params']].sort_values(by='rank_test_score')
        print(result_df.head(5))
        print("*******************Training************************")
        print("Best. score: %s" % opt.best_score_)
        # (Removed an unused predict_proba on the train split: its result
        # was never consumed.)
        y_pred = opt.predict(X['train'])
        Plot.confusion_matrix(y['train'], y_pred)
        ##################################
        print("*****************Testing***************************")
        y_pred_prob = opt.predict_proba(X['test'])[:, 1]
        y_pred = opt.predict(X['test'])
        # Fixed: previously referenced the module-global y_test instead of
        # the y['test'] split passed in, coupling this method to the script.
        Plot.plot_auc(y['test'], y_pred_prob)
        Plot.confusion_matrix(y['test'], y_pred)
        Plot.plot_precision_recall(y['test'], y_pred_prob)
# NOTE(review): `%time` on its own line times an empty statement; `%%time`
# at the top of the cell was probably intended — confirm.
%time
# Split the engineered features, run the Bayesian search on XGBoost,
# then print/plot the full evaluation report.
name = 'XGBoost'
X_train, X_test, y_train, y_test = random_create_data(final_df, trgt='class')
# Bundle the splits so evaluate_opt can index them by 'train'/'test'.
X = {'train': X_train, 'test':X_test}
y = {'train': y_train, 'test':y_test}
opt = HyperTuning.baysian_search(name, X['train'], y['train'])
HyperTuning.evaluate_opt(opt, X, y)
# SHAP explanation of the tuned XGBoost model.
# NOTE(review): shap was already imported and initjs() called near the top
# of the file — this repeat is harmless in a notebook but redundant.
import shap
shap.initjs()
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(opt.best_estimator_['clsfr'])
shap_values = explainer.shap_values(X_train)
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])
# summarize the effects of all the features
shap.summary_plot(shap_values, X_train)